Library Imports

from pyspark.sql import SparkSession
from pyspark.sql import types as T

from pyspark.sql import functions as F

from datetime import datetime
from decimal import Decimal

Template

spark = (
    SparkSession.builder
    .master("local")
    .appName("Section 3.1 - One to Many Rows")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

sc = spark.sparkContext

import os

data_path = "/data/pets.csv"
base_path = os.path.dirname(os.getcwd())
path = base_path + data_path
pets = spark.read.csv(path, header=True)
pets.toPandas()
id breed_id nickname birthday age color
0 1 1 King 2014-11-22 12:30:31 5 brown
1 2 3 Argus 2016-11-22 10:05:10 10 None
2 3 1 Chewie 2016-11-22 10:05:10 15 None
3 3 2 Maple 2018-11-22 10:05:10 17 white
4 4 2 None 2019-01-01 10:05:10 13 None

One to Many Rows

Very commonly you might have a column where it is an array type and you want to flatten that array out into multiple rows? Well let's look at how we can do that and some practical applications of it.

Multiple People Interested in Babysitting the Same Pet

Case 1: Get a Table with just the People Interested

Question to answer:

We have a couple of people interested in our little friend named "King", we want to create a new dataset containing a single row for each people interested in King. How can I do this?

pets_2 = (
    pets
    .where(F.col('id') == 1)
    .withColumn(
        'people_interested',
        F.array([
            F.lit('John'),
            F.lit('Doe'),
            F.lit('Bob'),
            F.lit('Billy')
        ])
    )
)

pets_2.toPandas()
id breed_id nickname birthday age color people_interested
0 1 1 King 2014-11-22 12:30:31 5 brown [John, Doe, Bob, Billy]
(
    pets_2
    .withColumn('people_interested', F.explode(F.col('people_interested')))
    .toPandas()
)
id breed_id nickname birthday age color people_interested
0 1 1 King 2014-11-22 12:30:31 5 brown John
1 1 1 King 2014-11-22 12:30:31 5 brown Doe
2 1 1 King 2014-11-22 12:30:31 5 brown Bob
3 1 1 King 2014-11-22 12:30:31 5 brown Billy

What Happened?

  1. So we first created a column that contained a list of names of the people interested.
  2. Next we split the list so that there is a row per person interested.

Case 2: Get a Table with just the People Interested and the Number of Days

Question to answer:

We have a couple of people interested in our little friend named "King" and the number of days they would like to babysit for, we want to create a new dataset containing a single row for each people interested in King. How can I do this?

pets_2 = (
    pets
    .where(F.col('id') == 1)
    .withColumn(
        'people_interested',
        F.array([
            F.create_map([F.lit('John'), F.lit(5)]),
            F.create_map([F.lit('Doe'), F.lit(3)]),
            F.create_map([F.lit('Bob'), F.lit(7)]),
            F.create_map([F.lit('Billy'), F.lit(9)])
        ])
    )
)

pets_2.toPandas()
id breed_id nickname birthday age color people_interested
0 1 1 King 2014-11-22 12:30:31 5 brown [{u'John': 5}, {u'Doe': 3}, {u'Bob': 7}, {u'Bi...
(
    pets_2
    .withColumn('people_interested', F.explode(F.col('people_interested')))
    .select(
        "*", 
        F.explode('people_interested').alias('person', 'days')
    )
    .toPandas()
)
id breed_id nickname birthday age color people_interested person days
0 1 1 King 2014-11-22 12:30:31 5 brown {u'John': 5} John 5
1 1 1 King 2014-11-22 12:30:31 5 brown {u'Doe': 3} Doe 3
2 1 1 King 2014-11-22 12:30:31 5 brown {u'Bob': 7} Bob 7
3 1 1 King 2014-11-22 12:30:31 5 brown {u'Billy': 9} Billy 9

What Happened?

  1. So we first created a column that contained a list of dictionary mapping the name of the person interested with the number of days they were interested.
  2. Here we had to explode twice, first to get a row per person interested (table grew longer), then to split each field of the dictioanry to get the name of the person and the number of days (table grew wider).

Summary

  • We looked at some pretty complex transformations that involved using the explode function which decomposed an array/map object into multiple rows/columns.

results matching ""

    No results matching ""